1 1. Setup

1.1 conda env on cluster

# Create env on cluster with mamba
mamba create -y \
  -n fst_env_rhel \
  -c bioconda gatk4
conda activate fst_env_rhel
mamba install bcftools plink2 r-base r-essentials r-tidyverse r-units libgdal r-sf
# Export
conda env export \
  --no-builds \
  -f envs/fst_env_rhel.yml
# Activate
conda activate fst_env_rhel

1.2 1.1. renv

# Export env (to renv.lock file)
renv::init()
# To install packages on new system, or 'activate' the env: 
renv::restore()

Load libaries

library(here)
library(tidyverse)
library(pegas)
library(knitr)
library(plotly)
library(ggridges)

1.3 1.3. Download 1GK data

1.3.1 1.3.1. Download from FTP

wget \
  -r -p -k \
  --no-parent \
  -cut-dirs=5 \
  ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/

1.3.2 1.3.2. Put list of files into list

find vcfs/ftp.1000genomes.ebi.ac.uk/ALL.chr*.vcf.gz \
  > human_traits_fst/data/20200205_vcfs.list

1.3.3 1.3.3. Merge VCFs

java -jar /nfs/software/birney/picard-2.9.0/picard.jar MergeVcfs \
  I=human_traits_fst/data/20200205_vcfs.list \
  O=vcfs/1gk_all.vcf.gz
# Exception in thread "main" java.lang.IllegalArgumentException: The contig entries in input file /hps/research1/birney/users/ian/rac_hyp/vcfs/ftp.1000genomes.ebi.ac.uk/ALL.chrMT.phase3_callmom-v0_4.20130502.genotypes.vcf.gz are not compatible with the others.

# So remove that one from list above
sed -i '/MT/d' human_traits_fst/data/20200205_vcfs.list

# run MergeVCFs again
java -jar /nfs/software/birney/picard-2.9.0/picard.jar MergeVcfs \
  I=human_traits_fst/data/20200205_vcfs.list \
  O=vcfs/1gk_all.vcf.gz
  
# Exception in thread "main" java.lang.IllegalArgumentException: The contig entries in input file /hps/research1/birney/users/ian/rac_hyp/vcfs/ftp.1000genomes.ebi.ac.uk/ALL.chrY.phase3_integrated_v2a.20130502.genotypes.vcf.gz are not compatible with the others.
sed -i '/chrY/d' human_traits_fst/data/20200205_vcfs.list

# run MergeVCFs again
java -jar /nfs/software/birney/picard-2.9.0/picard.jar MergeVcfs \
  I=human_traits_fst/data/20200205_vcfs.list \
  O=vcfs/1gk_all.vcf.gz
# SUCCESS

1.4 1.4. Obtain GWAS data

1.4.1 1.4.1. Educational attainment

From Lee et al. (2019) Gene discovery and polygenic prediction from a genome-wide association study of educational attainment in 1.1 million individuals, Nature: https://www.nature.com/articles/s41588-018-0147-3.

Data download link: https://static-content.springer.com/esm/art%3A10.1038%2Fs41588-018-0147-3/MediaObjects/41588_2018_147_MOESM3_ESM.xlsx

Saved here: data/20180723_Lee-et-al_supp-tables.xlsx

Notes: - European-ancestry cohort only. - n = 1,131,881 individuals. - SNP panel: ~10M from 1GKph3. - Phenotype: number of years of schooling that individuals completed (EduYears). - Identified 1271 independent SNPs at the genome-wide significance level of 5e-8 (or 995 at P < 1e-8). - COJO: reduced to 765 at P < 5e-8: https://doi.org/10.1038/ng.2213 - Corrected by LD score regression: only ~5% of inflation attributable to bias. LD score intercept of 1.11.

1.4.2 1.4.2. Height

From Yengo et al. (2018) Meta-analysis of genome-wide association studies for height and body mass index in approximately 700000 individuals of European ancestry, Human Molecular Genetics: https://academic.oup.com/hmg/article-abstract/27/20/3641/5067845.

Notes: - European-ancestry cohort only. - Meta-analysis combining GIANT and UKBioBank cohorts from Wood et al. (2014) https://doi.org/10.1038/ng.3097 (height) and Locke et al. (2015) https://doi.org/10.1038/nature14177 (BMI). - n = Up to 795,624 individuals (456426 from UKBioBank and the rest (339198?) from GIANT). - SNP panel: ~2.4M HM2 SNPs. - Identified 3290 (2388 main, 902 secondary; 712 loci) and 941 (656 main, 285 secondary; 536 loci) near-independent SNPs associated with height and BMI respectively, at a genome-wide significance threshold of P < 1e−8). - Analysed with COJO: https://doi.org/10.1038/ng.2213 - Corrected by LD score regression: LD score intercept of 1.48 and 1.03 for height and BMI respectively.

Data downloaded from this webpage: https://portals.broadinstitute.org/collaboration/giant/index.php/GIANT_consortium_data_files Data download link: https://portals.broadinstitute.org/collaboration/giant/images/4/4b/Meta-analysis_Wood_et_al%2BUKBiobank_2018_top_3290_from_COJO_analysis.txt.gz

cd human_traits_fst/data
# download
wget https://portals.broadinstitute.org/collaboration/giant/images/4/4b/Meta-analysis_Wood_et_al%2BUKBiobank_2018_top_3290_from_COJO_analysis.txt.gz
# unzip
gunzip Meta-analysis_Wood_et_al%2BUKBiobank_2018_top_3290_from_COJO_analysis.txt.gz
# rename
mv Meta-analysis_Wood_et_al+UKBiobank_2018_top_3290_from_COJO_analysis.txt 20181015_Yengo-et-al_snps_height.txt

Saved here: data/20181015_Yengo-et-al_snps_height.txt

1.4.3 BMI

From Yengo et al. (2018) Meta-analysis of genome-wide association studies for height and body mass index in approximately 700000 individuals of European ancestry, Human Molecular Genetics: https://academic.oup.com/hmg/article-abstract/27/20/3641/5067845. (Same study as above.)

Data download link: https://portals.broadinstitute.org/collaboration/giant/images/e/e2/Meta-analysis_Locke_et_al%2BUKBiobank_2018_top_941_from_COJO_analysis_UPDATED.txt.gz

# download
wget -P data https://portals.broadinstitute.org/collaboration/giant/images/e/e2/Meta-analysis_Locke_et_al%2BUKBiobank_2018_top_941_from_COJO_analysis_UPDATED.txt.gz
# unzip
gunzip data/Meta-analysis_Locke_et_al+UKBiobank_2018_top_941_from_COJO_analysis_UPDATED.txt.gz
# rename
mv data/Meta-analysis_Locke_et_al+UKBiobank_2018_top_941_from_COJO_analysis_UPDATED.txt data/20181015_Yengo-et-al_snps_bmi.txt

1.4.4 1.4.3. IBD

1.4.4.1 Huang et al. (2017)

From Huang et al. (2017) Fine-mapping inflammatory bowel disease loci to single-variant resolution, Nature: https://www.nature.com/articles/nature22969#Sec29.

Notes: - European-ancestry cohort - n = 67,852 individuals - Identified 94 loci - Take from list of credible sets those with “1” in signal column.

Data download link (Supplementary Table 1): https://static-content.springer.com/esm/art%3A10.1038%2Fnature22969/MediaObjects/41586_2017_BFnature22969_MOESM2_ESM.xlsx

Saved here: data/20170628_Huang-et-al_supp-table-1.xlsx

1.4.4.2 Jostins et al. (2012)

From Jostins et al. (2012) Host–microbe interactions have shaped the genetic architecture of inflammatory bowel disease, Nature: https://www.nature.com/articles/nature11582

Notes: - European-ancestry cohort - n = 41,660 individuals - SNP panel: 1.23M imputed SNPs from HM3 - Identified 193 independent SNPs associated with at least one of Crohn’s, UC, or IBD, at genome-wide significance of P < 5e-8, merged into 163 loci.

Data download link (Supplementary Table 2): https://static-content.springer.com/esm/art%3A10.1038%2Fnature11582/MediaObjects/41586_2012_BFnature11582_MOESM90_ESM.zip

Saved here: data/20121031_Jostins-et-al_supp-table-2.xlsx

1.4.5 1.4.4. Skin pigmentation

[NOTE: All other traits have SNP-hits based on only European-ancestry populations. Here, the SNP hits come from various populations. Potential bias? Or if we only had pigmentation SNP-hits from European populations, would we expect the Fst to be even more skewed?]{colour = “red”}

  • Crawford et al. (2017) Loci associated with skin pigmentation identified in African populations, Science: https://doi.org/10.1126/science.aan8433. Take table 1, and manually transform ‘Ancestral>Divided’ column into new columns titled ‘tested_allele’ (the allele in bold) and ‘other_allele’. Save here: data/20171117_Crawford-et-al_Table-1.xlsx
  • Adhikari et al. (2019) A GWAS in Latin Americans highlights the convergent evolution of lighter skin pigmentation in Eurasia, Nature: https://doi.org/10.1038/s41467-018-08147-0. “Summary statistics from the GWAS analyses is deposited at GWAS central with the link http://www.gwascentral.org/study/HGVST3308”. Under the ‘Association results’ tab, there is one dataset for each of the 6 phenotypes tested:
    • Melanin index
    • Hair color
    • Eye color
    • Digital eye color: L (lightness)
    • Digital eye color: C (chroma)
    • Digital eye color: cosH (cosine of hue) Difficult to ascertain which ones had genome-wide significance. Instead, pull tables directly from paper and supplementary materials, and put here in different sheets: data/20190121_Adhikari-et-al_snps.xlsx
    • Table 1: 18 lead SNPs from paper, each with a different p-value for one of the 6 phenotypes.
    • supp_table_6: 11 SNPs associated with combined traits.
    • supp_table_12: 161 SNPs collagted from published association studies on pigmentation. See table for references, which were used to identify other pigmentation GWAS studies.
  • Hernandez-Pacheco et al. (2017) Identification of a novel locus associated with skin colour in African-admixed populations, Scientific Reports: https://doi.org/10.1038/srep44548. 9 hits with genome-wide significance here: data/20170316_Hernandez-Pacheco-et-al.xlsx.

Others compiled into the single XLSX doc data/20200622_pigmentation_snps.xlsx:

1.4.5.1 Create list of target SNPs

pig_snps <- list()
# Crawford
pig_snps[["crawford"]] <- readxl::read_excel(here("data", "20171117_Crawford-et-al_Table-1.xlsx")) %>% 
  dplyr::select(rsid = "RSID", everything()) %>% 
  dplyr::filter(!is.na(rsid),
                !rsid == ".")

# Adhikari
pig_snps[["adhikari_tbl_1"]] <- readxl::read_excel(here("data", "20190121_Adhikari-et-al_snps.xlsx"),
                                                   sheet = "Table 1",
                                                   skip = 1) %>% 
  dplyr::select(rsid = "rsID", everything()) %>% 
  dplyr::filter(!is.na(rsid))

pig_snps[["adhikari_supp_6"]] <- readxl::read_excel(here("data", "20190121_Adhikari-et-al_snps.xlsx"),
                                                   sheet = "supp_table_6") %>% 
  dplyr::select(rsid = "SNP", everything()) %>% 
  dplyr::filter(!is.na(rsid))  

pig_snps[["adhikari_supp_12"]] <- readxl::read_excel(here("data", "20190121_Adhikari-et-al_snps.xlsx"),
                                                   sheet = "supp_table_12") %>% 
  dplyr::select(rsid = "SNP", everything()) %>% 
  dplyr::filter(!is.na(rsid))

# Hernandez-Pacheco
pig_snps[["hernandez-pacheco"]] <- readxl::read_excel(here("data", "20170316_Hernandez-Pacheco-et-al.xlsx"))

# Doc with SNPs from multiple studies
sheet_names <- readxl::excel_sheets(here("data", "20200622_pigmentation_snps.xlsx"))
compiled_snps <- lapply(sheet_names, function(x){
  x <- readxl::read_excel(here("data", "20200622_pigmentation_snps.xlsx"),
                     sheet = x)
})
names(compiled_snps) <- sheet_names

# Combine
pig_snps <- c(pig_snps, compiled_snps)

1.4.5.2 1.4.5.1. Pull out unique SNPs

pig_df <- lapply(pig_snps, function(x) dplyr::select(x, rsid))
pig_df <- dplyr::bind_rows(pig_df)
pig_df <- unique(pig_df)

nrow(pig_df)

[1] 266

knitr::kable(head(pig_df))
rsid
rs2413887
rs1426654
rs1834640
rs2675345
rs8028919
rs10424065

1.5 1.5. Filter 1GK VCF for target SNPs

1.5.1 Create lists of target SNPs

1.5.1.1 EDU

## extract from excel doc
snps_eduyrs <- readxl::read_xlsx(here::here("data", "20180723_Lee-et-al_supp-tables.xlsx"), sheet = "2. EduYears Lead SNPs", skip = 1, n_max = 1271)
## write table of SNPs
write.table(snps_eduyrs[["SNP"]], here::here("data", "snps_edu.list"), quote = F, row.names = F, col.names = F)

1.5.1.2 HEI

cut -f1 human_traits_fst/data/20181015_Yengo-et-al_snps_height.txt | tail -n+2 \
  > human_traits_fst/data/snps_hei.list

1.5.1.3 BMI

cut -f1 human_traits_fst/data/20181015_Yengo-et-al_snps_bmi.txt | tail -n+2 \
  > human_traits_fst/data/snps_bmi.list

1.5.1.4 IBD

# extract from excel doc
snps_ibd <- readxl::read_xlsx(path = here::here("data", "20170628_Huang-et-al_supp-table-1.xlsx"),
                      sheet = "list of credible sets") %>% 
  dplyr::filter(signal == 1) %>% # take first-ranked SNP in locus
  dplyr::filter(!grepl("chr", variant.lead)) %>% # remove SNPs without rsID
  dplyr::select(variant.lead) # take just rsID
  
# write tables of SNPs
write.table(x = snps_ibd$variant.lead,
            file = here::here("data", "snps_ibd.list"),
            quote = F,
            row.names = F,
            col.names = F)

1.5.1.5 PIG

write.table(x = pig_df$rsid,
            file = here::here("data", "snps_pig.list"),
            quote = F,
            row.names = F,
            col.names = F)

1.5.2 Create filtered VCFs

traits=$(echo edu hei bmi ibd pig)

for trait in $(echo $traits ); do
  gatk SelectVariants \
    -R refs/hs37d5.fa.gz \
    -V vcfs/1gk_all.vcf.gz \
    --keep-ids human_traits_fst/data/snps_$trait.list \
    -O vcfs/snphits_$trait.vcf.gz ;
done    

1.6 1.6. Get allele frequencies with Plink2

1.6.1 1.6.1. Import 1GK metadata (for sample-population key)

Downloded via this page: http://www.internationalgenome.org/data Download link: http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/working/20130606_sample_info/20130606_sample_info.xlsx.

Saved here: data/20130606_sample_info.xlsx

meta = readxl::read_xlsx(here::here("data", "20130606_sample_info.xlsx"),
                          sheet = "Sample Info") %>%
  dplyr::select(Sample, Population, Gender)
knitr::kable(head(meta))
Sample Population Gender
HG00096 GBR male
HG00097 GBR female
HG00098 GBR male
HG00099 GBR female
HG00100 GBR female
HG00101 GBR male

1.6.2 1.6.2. Write population file for Plink2

write.table(meta[, 1:2],
            here::here("data", "plink2_sample_popn_key.txt"),
            quote = F,
            sep = "\t",
            row.names = F,
            col.names = F)

1.6.3 1.6.3. Set up directories on EBI cluster

mkdir -p human_traits_fst/data/20200622_plink2_alfreqs

traits=$(echo edu hei ibd pig)

for i in $(echo $traits); do
  mkdir -p human_traits_fst/data/20200622_plink2_alfreqs/$i; 
done   

1.6.4 1.6.4. Run Plink2

traits=$(echo edu hei bmi ibd pig)

# Get AF per population
for trait in $(echo $traits); do
  # make directory
  mkdir -p human_traits_fst/data/20200622_plink2_alfreqs/$trait ;
  # create allele-freq tables
  plink2 \
    --vcf vcfs/snphits_$trait.vcf.gz \
    --freq \
    --max-alleles 2 \
    --pheno iid-only human_traits_fst/data/plink2_sample_popn_key.txt \
    --loop-cats PHENO1 \
    --out human_traits_fst/data/20200622_plink2_alfreqs/$trait/$trait ;
done

# Get global AF
for trait in $(echo $traits); do
  # make directory
  mkdir -p human_traits_fst/data/20200622_plink2_alfreqs/$trait ;
  # create allele-freq tables
  bsub \
    -o log/alfreq_global_$trait.out \
    -e log/alfreq_global_$trait.err \
    """
    conda activate fst_env_rhel ;
    plink2 \
      --vcf vcfs/snphits_$trait.vcf.gz \
      --freq \
      --max-alleles 2 \
      --out human_traits_fst/data/20200622_plink2_alfreqs/$trait/$trait.all
    """;
done

2 Analysis

3 2. Analysis

3.1 Set plotting parameters

# Create factor levels for `trait`
trait_levels = c("edu", "hei", "bmi", "ibd", "pig")
# Create vector for recoding traits with full names
recode_vec = c("edu" = "Educational attainment",
               "hei" = "Height",
               "bmi" = "BMI",
               "ibd" = "IBD",
               "pig" = "Pigmentation")
# Colour palette
fill_pal = c("Educational attainment" = "#00afbb",
             "Height" = "#FFBF00",
             "BMI" = "#0bc166",
             "IBD" = "#360568",
             "Pigmentation" = "#fc4e07")

3.2 2.1. Compare allele frequencies

3.2.1 Histograms of the global MAF distributions (i.e. all 26 populations together)

# Get list of files
target_files = list.files(here::here("data", "20200622_plink2_alfreqs"),
                          pattern = "all.afreq",
                          recursive = T,
                          full.names = T)

# Read in data
alfreq_list_glob = lapply(target_files, function(file){
  trait_df = read.table(file,
                        header = T,
                        comment.char = "")

})

# Set names
names(alfreq_list_glob) = gsub(pattern = ".all.afreq",
                       replacement = "",
                       basename(target_files))

# Merge into single Df
alfreq_glob_df = dplyr::bind_rows(alfreq_list_glob, .id = "trait")

# Get major and minor alleles and frequency
alfreq_glob_df = alfreq_glob_df %>% 
  dplyr::mutate(MAJ = if_else(ALT_FREQS <= 0.5,
                              REF,
                              ALT),
                MIN = ifelse(ALT_FREQS <= 0.5,
                             ALT,
                             REF),
                MAF = ifelse(ALT_FREQS <= 0.5,
                             ALT_FREQS,
                             1 - ALT_FREQS))

# Make `trait` a factor
alfreq_glob_df$trait = factor(alfreq_glob_df$trait, levels = trait_levels)
# Recode traits for plotting
alfreq_glob_df$trait = dplyr::recode(alfreq_glob_df$trait, !!!recode_vec)

Plot

alfreq_glob_df %>% 
  ggplot(aes(MAF, fill = trait)) +
    geom_histogram() +
    scale_fill_manual(values = fill_pal) +
    facet_wrap(~trait, nrow = 2) +
    guides(fill = F) +
    theme_bw() +
    ggtitle("Global MAF distributions")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Save
ggsave(here("plots", "20210114_global_maf_distributions.svg"),
       device = "svg",
       units = "cm",
       dpi = 400,
       height = 12,
       width = 19.5)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

3.2.2 2.1.1. Read in population frequency data

target_dirs <- list.dirs(here::here("data", "20200622_plink2_alfreqs"), recursive = F)

al_freq_lst <- lapply(target_dirs, function(x){
  target_files <- list.files(x, pattern = ".afreq", full.names = T)
  # read in data
  data_lst <- lapply(target_files, function(target_file){
    read.table(target_file,
               header = T,
               comment.char = "")
  })
  # fix names of populations
  names(data_lst) <- gsub(pattern = "edu.|hei.|bmi.|ibd.|pig.|.afreq",
                          replacement = "",
                          x = list.files(x, pattern = ".afreq"))
  return(data_lst)
})

# set names
names(al_freq_lst) <- basename(target_dirs)

3.2.3 2.1.2. Turn into single table for each pheno

al_freq_df <- lapply(al_freq_lst, function(pheno){
  out <- dplyr::bind_rows(pheno, .id = "population") %>% 
    tidyr::pivot_wider(id_cols = c(X.CHROM, ID, REF, ALT),
                       names_from = population,
                       values_from = ALT_FREQS)
})

3.2.4 2.1.3. Randomly swap minor allele

set.seed(65)
rdm_sds <- sample(1:100, 5)

counter <- 0
al_freq_df_shuff <- lapply(al_freq_df, function(pheno_df){
  counter <<- counter + 1
  # set seed
  set.seed(rdm_sds[counter])
  # select SNPs to swap (half of total)
  tgt_indcs <- sample(nrow(pheno_df), nrow(pheno_df) /2)
  # swap minor alleles
  pheno_df[tgt_indcs, 5:ncol(pheno_df)] <- 1 - pheno_df[tgt_indcs, 5:ncol(pheno_df)]
  # return pheno_df
  return(pheno_df)
})

3.2.5 Bind in to single data frame and save

final = dplyr::bind_rows(al_freq_df_shuff, .id = "phenotype")
final %>% 
  readr::write_tsv(here("data", "20210114_pheno_alfreqs.txt.gz"))

3.3 Set up negative controls

Pull out random SNPs with the same allele frequencies as the GWAS SNP-hits

3.3.1 Create table of variants with allele frequencies

With Plink2

# Per chromosome
for chr in $(seq 1 24) ; do
  # make directory
  mkdir -p big_data/20210114_alfreqs_all/by_chr/$chr;
  # create allele-freq tables
  bsub \
    -M 10000 \
    -o log/20210114_plink_alfreq_$chr.out \
    -e log/20210114_plink_alfreq_$chr.err \
    "plink2 \
      --vcf vcfs/1gk_all.vcf.gz \
      --freq \
      --chr $chr \
      --max-alleles 2 \
      --pheno iid-only human_traits_fst/data/plink2_sample_popn_key.txt \
      --loop-cats PHENO1 \
      --out big_data/20210114_alfreqs_all/by_chr/$chr/$chr ";
done 

3.4 Compare allele frequencies

3.4.1 Read in data

target_dirs <- list.dirs(here::here("data", "20200622_plink2_alfreqs"), recursive = F)

al_freq_lst <- lapply(target_dirs, function(x){
  target_files <- list.files(x, pattern = ".afreq", full.names = T)
  # read in data
  data_lst <- lapply(target_files, function(target_file){
    read.table(target_file,
               header = T,
               comment.char = "")
  })
  # fix names of populations
  names(data_lst) <- gsub(pattern = "edu.|hei.|bmi.|ibd.|pig.|.afreq",
                          replacement = "",
                          x = list.files(x, pattern = ".afreq"))
  return(data_lst)
})

# set names
names(al_freq_lst) <- basename(target_dirs)

# reorder for (1) height, (2) eduyears, (3) ibd, (4) pigmentation
al_freq_lst <- al_freq_lst[c(2, 1, 3, 4)]

3.4.2 Turn into single table for each pheno

al_freq_df <- lapply(al_freq_lst, function(pheno){
  out <- dplyr::bind_rows(pheno, .id = "population") %>% 
    tidyr::pivot_wider(id_cols = c(X.CHROM, ID, REF, ALT),
                       names_from = population,
                       values_from = ALT_FREQS)
})

3.4.3 Randomly swap minor allele

set.seed(65)
rdm_sds <- sample(1:100, 4)

counter <- 0
al_freq_df_shuff <- lapply(al_freq_df, function(pheno){
  counter <<- counter + 1
  # set seed
  set.seed(rdm_sds[counter])
  # select SNPs to swap (half of total)
  tgt_indcs <- sample(nrow(pheno), nrow(pheno) /2)
  # swap minor alleles
  pheno[tgt_indcs, 5:ncol(pheno)] <- 1 - pheno[tgt_indcs, 5:ncol(pheno)]
  # return pheno
  return(pheno)
})

3.4.4 Plot

# Set up titles vector
titles <- c("Height", "Educational Attainment", "Inflammatory Bowel Disease", "Pigmentation")

3.4.4.1 2D

3.4.4.1.1 YRI v CHS
counter <- 0
lapply(al_freq_df_shuff, function(pheno){
  counter <<- counter + 1
  ggplot(pheno,
         aes(YRI, CHS)) +
    geom_point(size = 0.5) +
    coord_fixed() +
    geom_smooth(se = F, colour = "red") +
    geom_abline(intercept = 0, slope = 1, colour = "blue") +
    xlab("Allele frequency in YRI") +
    ylab("Allele frequency in CHS") +
    labs(title = titles[counter])
})
## $edu

## 
## $bmi

## 
## $hei

## 
## $ibd

3.4.4.1.2 YRI v CEU
counter <- 0
lapply(al_freq_df_shuff, function(pheno){
  counter <<- counter + 1
  ggplot(pheno,
         aes(YRI, CEU)) +
    geom_point(size = 0.5) +
    coord_fixed() +
    geom_smooth(se = F, colour = "red") +
    geom_abline(intercept = 0, slope = 1, colour = "blue") +
    xlab("Allele frequency in YRI") +
    ylab("Allele frequency in CEU") +
    labs(title = titles[counter])
})
## $edu

## 
## $bmi

## 
## $hei

## 
## $ibd

3.4.4.2 3D

colourscales <- c("Viridis", "Hot", "Bluered", "Electric")
titles <- c("Height", "Educational Attainment", "IBD", "Skin/hair pigmentation")

counter <- 0
plts <- lapply(al_freq_df_shuff, function(pheno){
  counter <<- counter + 1
  # set graph resolution
  graph_reso <- 0.05
  # get lm for data
  loess_model <- loess(CEU ~ 0 + CHS + YRI, data = pheno)
  # set up axes
  axis_x <- seq(min(pheno$CHS), max(pheno$CHS), by = graph_reso)
  axis_y <- seq(min(pheno$YRI), max(pheno$YRI), by = graph_reso)
  # sample points
  lm_surface <- expand.grid(CHS = axis_x,
                            YRI = axis_y,
                            KEEP.OUT.ATTRS = F)
  lm_surface$CEU <- predict(loess_model, newdata = lm_surface)
  lm_surface <- reshape2::acast(lm_surface, YRI ~ CHS, value.var = "CEU")
  # create plot
  plt <- plot_ly(pheno,
                 x = ~CHS,
                 y = ~YRI,
                 z = ~CEU,
                 type = "scatter3d",
                 mode = "markers",
                 marker = list(size = 2),
                 text = pheno$ID) 
  plt <- add_trace(plt,
                   z = lm_surface,
              x = axis_x,
              y = axis_y,
              type = "surface",
              colorscale = colourscales[counter]) %>% 
    layout(title = titles[counter])
  return(plt)
})

plts$hei
plts$edu
plts$ibd
plts$pig
## NULL

3.5 Fst

3.5.1 Find target VCFs

# list target VCFs
target_vcfs <- list.files(here::here("data"),
                          pattern = glob2rx("snphits_*.gz"), 
                          full.names = T)

# filter for the four we want
target_vcfs <- target_vcfs[grep("eduyrs|height|ibd_full|pig", target_vcfs)]

3.5.2 With all populations

3.5.2.1 Get Fst stats

# Create raw list of variants
vcf_list_raw <- lapply(target_vcfs, function(vcf_file){
  vcf_out <- pegas::read.vcf(vcf_file)
})

# Create vector of populations
populations <- unlist(lapply(rownames(vcf_list_raw[[1]]), function(sample){
  meta$Population[meta$Sample == sample]
}))

# Generate Fst stats
fst_out_lst <- lapply(vcf_list_raw, function(pheno){
  as.data.frame(pegas::Fst(pheno, pop = populations))
})

# make rownames into separate column
fst_out_lst <- lapply(fst_out_lst, function(pheno){
  pheno$snp <- rownames(pheno)
  return(pheno)
})
names(fst_out_lst) <- titles

# bind into single DF
fst_out_df <- dplyr::bind_rows(fst_out_lst, .id = "phenotype")

# Set order of phenotypes
fst_out_df$phenotype <- factor(fst_out_df$phenotype, levels = c("Height", "Educational Attainment", "IBD", "Skin/hair pigmentation"))

head(fst_out_df)

3.5.2.2 Plot density

3.5.2.2.1 2D
ggplot(fst_out_df, aes(Fst, fill = phenotype)) +
  geom_density(alpha = 0.7) +
  labs(fill = "Phenotype") +
  ylab("Density") +
  theme_bw() +
  scale_fill_manual(values = c("#E7B800", "#00AFBB", "#360568", "#FC4E07"))

3.5.2.2.2 3D
# factorise 
fst_out_df$phenotype_3d <- factor(fst_out_df$phenotype,
                                    levels = c("Skin/hair pigmentation", "IBD", "Educational Attainment", "Height"))

ggplot() +
  geom_density_ridges2(data = fst_out_df,
                       mapping = aes(x = Fst, y = phenotype_3d, fill = phenotype_3d),
                       scale = 2) +
  scale_fill_manual(values = c("#FC4E07", "#360568", "#00AFBB", "#E7B800")) +
  ylab(label = NULL) +
  theme_bw() +
  guides(fill = guide_legend(reverse=T, 
                             title = "Phenotype")) +
  scale_y_discrete(expand = expand_scale(add = c(0.2, 2.3)))

3.5.2.3 Run Kolmogorov-Smirnov Tests

# Height v EA
ks.test(fst_out_df$Fst[fst_out_df$phenotype == "Height"],
        fst_out_df$Fst[fst_out_df$phenotype == "Educational Attainment"])
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  fst_out_df$Fst[fst_out_df$phenotype == "Height"] and fst_out_df$Fst[fst_out_df$phenotype == "Educational Attainment"]
## D = 0.039184, p-value = 0.1203
## alternative hypothesis: two-sided
# Height v IBD
ks.test(fst_out_df$Fst[fst_out_df$phenotype == "Height"],
        fst_out_df$Fst[fst_out_df$phenotype == "IBD"])
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  fst_out_df$Fst[fst_out_df$phenotype == "Height"] and fst_out_df$Fst[fst_out_df$phenotype == "IBD"]
## D = 0.086335, p-value = 1.111e-06
## alternative hypothesis: two-sided
# Height v Pigmentation
ks.test(fst_out_df$Fst[fst_out_df$phenotype == "Height"],
        fst_out_df$Fst[fst_out_df$phenotype == "Skin/hair pigmentation"])
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  fst_out_df$Fst[fst_out_df$phenotype == "Height"] and fst_out_df$Fst[fst_out_df$phenotype == "Skin/hair pigmentation"]
## D = 0.30629, p-value < 2.2e-16
## alternative hypothesis: two-sided

3.5.3 With just YRI, CEU, and CHS

3.5.3.1 Get Fst stats

# get samples from target popns only
target_popns <- which(populations %in% c("YRI", "CEU", "CHS"))
populations_3pop <- populations[target_popns]

vcf_list_raw_3pop <- lapply(vcf_list_raw, function(pheno){
  pheno[target_popns, ]
})

# Generate Fst stats
fst_out_lst_3pop <- lapply(vcf_list_raw_3pop, function(pheno){
  as.data.frame(pegas::Fst(pheno, pop = populations_3pop))
})

# make rownames into separate column
fst_out_lst_3pop <- lapply(fst_out_lst_3pop, function(pheno){
  pheno$snp <- rownames(pheno)
  return(pheno)
})
names(fst_out_lst_3pop) <- titles

# bind into single DF
fst_out_df_3pop <- dplyr::bind_rows(fst_out_lst_3pop, .id = "phenotype")
head(fst_out_df_3pop)
##            phenotype Fit        Fst Fis        snp
## rs780569      Height   1 0.10356157   1   rs780569
## rs34394051    Height   1 0.02428879   1 rs34394051
## rs11121177    Height   1 0.14444609   1 rs11121177
## rs4846010     Height   1 0.09716410   1  rs4846010
## rs78116078    Height   1 0.17017201   1 rs78116078
## rs10799615    Height   1 0.17098100   1 rs10799615

3.5.3.2 Plot density

3.5.3.2.1 2D
ggplot(fst_out_df_3pop, aes(Fst, fill = phenotype)) +
  geom_density(alpha = 0.7) +
  labs(fill = "Phenotype") +
  ylab("Density") +
  theme_bw() +
  scale_fill_manual(values = c("#E7B800", "#00AFBB", "#360568", "#FC4E07"))

3.5.3.2.2 3D
# factorise 
fst_out_df_3pop$phenotype_3d <- factor(fst_out_df$phenotype,
                                    levels = c("Skin/hair pigmentation", "IBD", "Educational Attainment", "Height"))

ggplot() +
  geom_density_ridges2(data = fst_out_df_3pop,
                       mapping = aes(x = Fst, y = phenotype_3d, fill = phenotype_3d),
                       scale = 2) +
  scale_fill_manual(values = c("#FC4E07", "#360568", "#00AFBB", "#E7B800")) +
  ylab(label = NULL) +
  theme_bw() +
  guides(fill = guide_legend(reverse=T, 
                             title = "Phenotype")) +
  scale_y_discrete(expand = expand_scale(add = c(0.2, 2.3)))

3.5.3.3 Run Kolmogorov-Smirnov Tests

# Height v EA
ks.test(fst_out_df_3pop$Fst[fst_out_df_3pop$phenotype == "Height"],
        fst_out_df_3pop$Fst[fst_out_df_3pop$phenotype == "Educational Attainment"])
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  fst_out_df_3pop$Fst[fst_out_df_3pop$phenotype == "Height"] and fst_out_df_3pop$Fst[fst_out_df_3pop$phenotype == "Educational Attainment"]
## D = 0.025151, p-value = 0.6096
## alternative hypothesis: two-sided
# Height v IBD
ks.test(fst_out_df_3pop$Fst[fst_out_df_3pop$phenotype == "Height"],
        fst_out_df_3pop$Fst[fst_out_df_3pop$phenotype == "IBD"])
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  fst_out_df_3pop$Fst[fst_out_df_3pop$phenotype == "Height"] and fst_out_df_3pop$Fst[fst_out_df_3pop$phenotype == "IBD"]
## D = 0.065523, p-value = 0.0005063
## alternative hypothesis: two-sided
# Height v Pigmentation
ks.test(fst_out_df_3pop$Fst[fst_out_df_3pop$phenotype == "Height"],
        fst_out_df_3pop$Fst[fst_out_df_3pop$phenotype == "Skin/hair pigmentation"])
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  fst_out_df_3pop$Fst[fst_out_df_3pop$phenotype == "Height"] and fst_out_df_3pop$Fst[fst_out_df_3pop$phenotype == "Skin/hair pigmentation"]
## D = 0.30608, p-value < 2.2e-16
## alternative hypothesis: two-sided